Question: How many football matches do the students watch per week?
import numpy as np
# Weekly football-match counts from the survey (presumably one entry per student).
matches = np.array([
    3, 6, 0, 5, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 2, 0, 0, 0, 4,
])
# Average number of matches across all entries.
np.mean(matches)
import numpy as np
import pandas as pd
# Demo frame: row 0 is complete, row 2 is entirely missing, rows 1 and 3 are partly missing.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data
# Number of missing values in each column
data.isnull().sum()
# Drop every row that has at least one missing value (how="any" is the default)
data.dropna(how="any")
# how="all" drops only the rows in which every value is missing
data.dropna(how="all")
# Same idea along the other axis: drop columns that contain a missing value
data.dropna(axis=1)
# Create a 7x3 dataframe of standard-normal draws, then blank out a few cells
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
# Column 1 loses rows 0, 2 and 4
df.iloc[[0, 2, 4], 1] = np.nan
# Column 2 loses row 4 and the last row
df.iloc[[4, -1], 2] = np.nan
df
# Fill with constant value
df.fillna(0)
# Fill each column with its own mean (median/max/min work the same way)
df.fillna(df.mean())
# Forward fill: propagate the last valid observation forward.
# ffill() is used because fillna(method="ffill") is deprecated in pandas.
df.ffill()
# Backward fill: propagate the next valid observation backward.
# bfill() is the non-deprecated spelling of fillna(method="bfill").
df.bfill()
import pandas as pd
# Start from the raw retail export.
df = pd.read_csv("online_retail_II 2.csv")

# Derived monetary columns: line total, 16% tax on it, and the tax-inclusive gross.
line_total = df['Quantity'] * df['Price']
df['Total'] = line_total
df['Tax 16%'] = df['Total'] * 0.16
df['Gross'] = df['Total'] + df['Tax 16%']

# Every row is a card payment except 1543 randomly chosen rows switched to cash.
df['Payment Method'] = 'Credit Card'
cash_rows = df.sample(1543).index
df.loc[cash_rows, ['Payment Method']] = "Cash"

# Keep only rows with a positive total and a known customer.
keep = (df['Total'] > 0) & df['Customer ID'].notna()
df = df[keep]
import random

# Blank out randomly sampled cells. Each entry is (number of rows, column(s));
# the trailing note records how that gap is meant to be repaired later.
# The order matters: every df.sample() call draws independently, so later
# entries may overlap rows already blanked by earlier ones.
noise_plan = [
    (8872, 'Price'),  # Calculated feature
    (1245, 'Customer ID'),  # Infer from invoice
    (4537, ['Quantity', 'Price']),  # group mean (StockCode or Description)
    (326, ['Payment Method']),  # most frequent
    (26, ['Quantity', 'Price', 'StockCode', 'Description']),  # drop
]
for n_rows, target_columns in noise_plan:
    df.loc[df.sample(n_rows).index, target_columns] = np.nan

# Fix the column order of the exported file.
column_order = [
    'Invoice', 'StockCode', 'Description', 'Quantity', 'Price', 'Total',
    'Tax 16%', 'Gross', 'InvoiceDate', 'Payment Method', 'Customer ID', 'Country',
]
df = df[column_order]
df.to_csv("online_retail_II 2_noisy.csv", index=False)
import pandas as pd
# Load the noisy export and preview its first eight rows.
df = pd.read_csv("online_retail_II 2_noisy.csv")
df.iloc[:8]
| Invoice | StockCode | Description | Quantity | Price | Total | Tax 16% | Gross | InvoiceDate | Payment Method | Customer ID | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12.0 | 6.95 | 83.4 | 13.344 | 96.744 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12.0 | 6.75 | 81.0 | 12.960 | 93.960 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12.0 | 6.75 | 81.0 | 12.960 | 93.960 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48.0 | 2.10 | 100.8 | 16.128 | 116.928 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24.0 | 1.25 | 30.0 | 4.800 | 34.800 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
| 5 | 489434 | 22064 | PINK DOUGHNUT TRINKET POT | 24.0 | 1.65 | 39.6 | 6.336 | 45.936 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
| 6 | 489434 | 21871 | SAVE THE PLANET MUG | 24.0 | 1.25 | 30.0 | 4.800 | 34.800 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
| 7 | 489434 | 21523 | FANCY FONT HOME SWEET HOME DOORMAT | 10.0 | 5.95 | 59.5 | 9.520 | 69.020 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom |
# Missing-value count for every column
df.isnull().sum()
Invoice 0 StockCode 26 Description 26 Quantity 4563 Price 13390 Total 0 Tax 16% 0 Gross 0 InvoiceDate 0 Payment Method 326 Customer ID 1245 Country 0 dtype: int64
# Rows whose Price is missing
price_missing = df['Price'].isna()
df.loc[price_missing]
| Invoice | StockCode | Description | Quantity | Price | Total | Tax 16% | Gross | InvoiceDate | Payment Method | Customer ID | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 489435 | 22353 | LUNCHBOX WITH CUTLERY FAIRY CAKES | NaN | NaN | 30.60 | 4.8960 | 35.4960 | 2009-12-01 07:46:00 | Credit Card | 13085.0 | United Kingdom |
| 47 | 489437 | 20971 | PINK BLUE FELT CRAFT TRINKET BOX | NaN | NaN | 15.00 | 2.4000 | 17.4000 | 2009-12-01 09:08:00 | Credit Card | 15362.0 | United Kingdom |
| 52 | 489437 | 22111 | SCOTTIE DOG HOT WATER BOTTLE | NaN | NaN | 14.85 | 2.3760 | 17.2260 | 2009-12-01 09:08:00 | Credit Card | 15362.0 | United Kingdom |
| 60 | 489438 | 21411 | GINGHAM HEART DOORSTOP RED | 32.0 | NaN | 80.00 | 12.8000 | 92.8000 | 2009-12-01 09:24:00 | Credit Card | 18102.0 | United Kingdom |
| 77 | 489439 | 16161P | WRAP ENGLISH ROSE | 25.0 | NaN | 10.50 | 1.6800 | 12.1800 | 2009-12-01 09:28:00 | Credit Card | 12682.0 | France |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 805151 | 581496 | 22112 | CHOCOLATE HOT WATER BOTTLE | NaN | NaN | 29.70 | 4.7520 | 34.4520 | 2011-12-09 10:20:00 | Credit Card | 16558.0 | United Kingdom |
| 805162 | 581496 | 22190 | LOCAL CAFE MUG | 24.0 | NaN | 9.36 | 1.4976 | 10.8576 | 2011-12-09 10:20:00 | Credit Card | 16558.0 | United Kingdom |
| 805189 | 581501 | 21564 | PINK HEART SHAPE LOVE BUCKET | 24.0 | NaN | 18.96 | 3.0336 | 21.9936 | 2011-12-09 10:46:00 | Credit Card | 12985.0 | United Kingdom |
| 805346 | 581567 | 22464 | HANGING METAL HEART LANTERN | 24.0 | NaN | 18.96 | 3.0336 | 21.9936 | 2011-12-09 11:56:00 | Credit Card | 16626.0 | United Kingdom |
| 805514 | 581585 | 84879 | ASSORTED COLOUR BIRD ORNAMENT | 16.0 | NaN | 27.04 | 4.3264 | 31.3664 | 2011-12-09 12:31:00 | Credit Card | 15804.0 | United Kingdom |
13390 rows × 12 columns
def find_na(inv):
    """Report whether an invoice mixes known and missing customer IDs.

    Args:
        inv: DataFrame holding the rows of one invoice; only its
            'Customer ID' column is read.

    Returns:
        bool: True when at least one 'Customer ID' is missing and at least
        one is present; False otherwise (including for an empty frame).
    """
    missing = inv['Customer ID'].isna()
    # "Mixed" means some values are missing but not all of them. Both operands
    # are scalars, so short-circuit `and` is used instead of bitwise `&`.
    return bool(missing.any() and not missing.all())
# One boolean per invoice: True where the invoice has both known and missing customer IDs
per_invoice = df.groupby(['Invoice']).apply(find_na)
mixed = per_invoice.to_frame()
mixed
| 0 | |
|---|---|
| Invoice | |
| 489434 | False |
| 489435 | False |
| 489436 | False |
| 489437 | False |
| 489438 | False |
| ... | ... |
| 581583 | False |
| 581584 | False |
| 581585 | False |
| 581586 | False |
| 581587 | False |
36969 rows × 1 columns
# Price is a calculated feature: recover it from Total / Quantity where Quantity is known.
df['Price'] = df['Price'].fillna(df['Total'] / df['Quantity'])
# Missing quantities take the mean quantity of the same StockCode.
df['Quantity'] = df['Quantity'].groupby(df["StockCode"]).transform(lambda x: x.fillna(x.mean()))
# Infer a missing Customer ID from the other rows of the same invoice (assumes an
# invoice belongs to a single customer). transform('first') yields each invoice's
# first non-null Customer ID aligned to df's index; reading .values[0] instead
# would propagate NaN whenever the first row of the invoice is the missing one.
df['Customer ID'] = df['Customer ID'].fillna(df.groupby('Invoice')['Customer ID'].transform('first'))
df
| Invoice | StockCode | Description | Quantity | Price | Total | Tax 16% | Gross | InvoiceDate | Payment Method | Customer ID | Country | Quantity_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12.0 | 6.95 | 83.40 | 13.344 | 96.744 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 12.0 |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12.0 | 6.75 | 81.00 | 12.960 | 93.960 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 12.0 |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12.0 | 6.75 | 81.00 | 12.960 | 93.960 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 12.0 |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48.0 | 2.10 | 100.80 | 16.128 | 116.928 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 48.0 |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24.0 | 1.25 | 30.00 | 4.800 | 34.800 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 24.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 805544 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6.0 | 2.10 | 12.60 | 2.016 | 14.616 | 2011-12-09 12:50:00 | Credit Card | 12680.0 | France | 6.0 |
| 805545 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4.0 | 4.15 | 16.60 | 2.656 | 19.256 | 2011-12-09 12:50:00 | Credit Card | 12680.0 | France | 4.0 |
| 805546 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4.0 | 4.15 | 16.60 | 2.656 | 19.256 | 2011-12-09 12:50:00 | Credit Card | 12680.0 | France | 4.0 |
| 805547 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3.0 | 4.95 | 14.85 | 2.376 | 17.226 | 2011-12-09 12:50:00 | Credit Card | 12680.0 | France | 3.0 |
| 805548 | 581587 | POST | POSTAGE | 1.0 | 18.00 | 18.00 | 2.880 | 20.880 | 2011-12-09 12:50:00 | Credit Card | 12680.0 | France | 1.0 |
805549 rows × 13 columns
def fill_price_by_stock_code(group):
    """Fill missing prices in a group with the group's first known price.

    Args:
        group: pandas Series of prices for one group (used with
            groupby(...).transform, so the result keeps the group's index).

    Returns:
        A Series of the same length and index in which every missing value
        is replaced by the first non-missing value of the group. A group
        with no known price at all is returned unchanged (all missing).
    """
    known = group.dropna()
    if known.empty:
        # No reference price to copy; indexing .iloc[0] here would raise IndexError.
        return group.copy()
    return group.fillna(known.iloc[0])
# Remaining missing prices: copy the first known price of the same StockCode
prices_by_code = df.groupby('StockCode')['Price']
df['Price'] = prices_by_code.transform(fill_price_by_stock_code)
import numpy as np
def find_customer(group):
    """Fill missing entries of a group with the group's first known value.

    The function is generic despite its name: it works on any Series passed
    in by groupby(...).transform.

    Args:
        group: pandas Series holding the values of one group.

    Returns:
        A Series of the same length and index in which every missing value
        is replaced by the first non-missing value of the group. A group
        with no known value is returned unchanged (all missing).
    """
    known = group.dropna()
    if known.empty:
        # Nothing to borrow from, so the values stay missing.
        return group.copy()
    return group.fillna(known.iloc[0])
# Missing payment methods: copy the first known payment method of the same invoice
payment_by_invoice = df.groupby('Invoice')['Payment Method']
df['Payment Method'] = payment_by_invoice.transform(find_customer)
# Remaining missing values per column
df.isnull().sum()
Invoice 0 StockCode 26 Description 26 Quantity 26 Price 26 Total 0 Tax 16% 0 Gross 0 InvoiceDate 0 Payment Method 1 Customer ID 62 Country 0 Quantity_2 26 dtype: int64
# Seven rows in which only the last one repeats an earlier (k1, k2) pair.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data
# Check for duplicates: True marks a row identical to an earlier row
data.duplicated()
# Drop duplicates: returns a DataFrame keeping only the rows that duplicated() marks False
data.drop_duplicates()
# Add a column with a distinct value per row
data["v1"] = range(len(data))
data
# Drop with parameters: compare on k1/k2 only and keep the last occurrence
data.drop_duplicates(subset=["k1", "k2"], keep="last")
# Small produce table used to demonstrate value mapping and label clean-up.
df = pd.DataFrame(
    {
        'species': ['carrot', 'potato', 'mango', 'apple'],
        'color': ['red', 'yellow', 'yellow', 'red'],
    }
)
df
# Species -> category. The labels are inconsistent ('veg' vs 'vegetables',
# 'Fruit' vs 'fruit') and are normalized further down.
mappings = dict(carrot='veg', potato='vegetables', mango='Fruit', apple='fruit')
df['type_name'] = df['species'].map(mappings)
df
# df['type_name'].replace
df['type_name'].value_counts()
# Collapse the spelling variants onto one label each
canonical_labels = {'veg': 'vegetables', 'Fruit': 'fruit'}
df['type_name'] = df['type_name'].replace(canonical_labels)
df['type_name'].value_counts()
Outliers are values that differ dramatically from the other values.




# Reload the source export for the outlier demo.
df = pd.read_csv("online_retail_II 2.csv")
df
# z-score: how many standard deviations each price lies from the mean price
df['price_zscore'] = (df['Price'] - df['Price'].mean()) / df['Price'].std()
# Flag prices at least three standard deviations from the mean. The comparison is
# vectorized rather than a Python lambda applied row by row; NaN scores compare False.
df['outlier'] = df['price_zscore'].abs() >= 3
# A boolean column is already a valid mask, so no `== True` is needed.
df[df['outlier']]
val = "a,b, guido"
# Split on commas (whitespace around the pieces is kept)
val.split(sep=",")
# Trim: strip the whitespace from every piece
pieces = list(map(str.strip, val.split(",")))
pieces
first, second, third = pieces
# Concatenate the three pieces with a "::" separator
f"{first}::{second}::{third}"
# Concatenate using join
"::".join(pieces)
# Search in string: substring membership test
"guido" in val
# Search using find and index: find returns -1 when the substring is absent
val.find(":")
# val.index(":")
# Count occurrences of a substring
val.count(",")
# Replace string
val.replace(",", "::")

# Sample string whose words are separated by a mix of spaces and tabs
text = "foo bar\t baz \tqux"
We need to split the text by whitespace characters (tabs, spaces, and newlines).
import re
# Raw string: "\s" in a normal string literal is an invalid escape sequence,
# so the pattern is written as r"\s+" to reach the regex engine unchanged.
re.split(r"\s+", text)
['foo', 'bar', 'baz', 'qux']
# When applying the same regex several times, compile it once and reuse the compiled object
regex = re.compile(r'\s+')
# Same split as re.split above, done through the compiled pattern
regex.split(text)
['foo', 'bar', 'baz', 'qux']
# Multi-line sample: one "name email" pair per line
text = """Dave dave@google.com
Steve Steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""
text
'Dave dave@google.com\n Steve Steve@gmail.com\n Rob rob@gmail.com\n Ryan ryan@yahoo.com'
Retrieve all email addresses mentioned in the text
# Letters, "@", letters, a literal dot, letters. The dot is escaped because an
# unescaped "." matches any character, not just the dot before the domain suffix.
pattern = r"[a-z]+@[a-z]+\.[a-z]+"
# IGNORECASE lets the lowercase character classes match uppercase letters too
re.findall(pattern, text, flags=re.IGNORECASE)
['dave@google.com', 'Steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
# search() returns a match object for the first match only (or None).
# NOTE(review): as far as this file shows, `regex` is still the whitespace pattern
# compiled earlier, so this finds the first whitespace run in `text`; presumably a
# regex compiled from the email `pattern` was intended — confirm.
m = regex.search(text)
m

# Search for all "LIGHT" items.
# str.contains performs a plain substring test (regex=False); na=False treats rows
# with a missing Description as non-matches, so the mask is boolean without `== True`.
df[df['Description'].str.contains("LIGHT", regex=False, na=False)]
| Invoice | StockCode | Description | Quantity | Price | Total | Tax 16% | Gross | InvoiceDate | Payment Method | Customer ID | Country | Quantity_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12.0 | 6.95 | 83.40 | 13.3440 | 96.7440 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 12.0 |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12.0 | 6.75 | 81.00 | 12.9600 | 93.9600 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 12.0 |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12.0 | 6.75 | 81.00 | 12.9600 | 93.9600 | 2009-12-01 07:45:00 | Credit Card | 13085.0 | United Kingdom | 12.0 |
| 29 | 489436 | 84596L | BISCUITS SMALL BOWL LIGHT BLUE | 8.0 | 1.25 | 10.00 | 1.6000 | 11.6000 | 2009-12-01 09:06:00 | Credit Card | 13078.0 | United Kingdom | 8.0 |
| 46 | 489437 | 84970S | HANGING HEART ZINC T-LIGHT HOLDER | 12.0 | 0.85 | 10.20 | 1.6320 | 11.8320 | 2009-12-01 09:08:00 | Credit Card | 15362.0 | United Kingdom | 12.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 805513 | 581585 | 23084 | RABBIT NIGHT LIGHT | 12.0 | 2.08 | 24.96 | 3.9936 | 28.9536 | 2011-12-09 12:31:00 | Credit Card | 15804.0 | United Kingdom | 12.0 |
| 805515 | 581585 | 84945 | MULTI COLOUR SILVER T-LIGHT HOLDER | 24.0 | 0.85 | 20.40 | 3.2640 | 23.6640 | 2011-12-09 12:31:00 | Credit Card | 15804.0 | United Kingdom | 24.0 |
| 805523 | 581585 | 84946 | ANTIQUE SILVER T-LIGHT GLASS | 12.0 | 1.25 | 15.00 | 2.4000 | 17.4000 | 2011-12-09 12:31:00 | Credit Card | 15804.0 | United Kingdom | 12.0 |
| 805527 | 581585 | 23145 | ZINC T-LIGHT HOLDER STAR LARGE | 12.0 | 0.95 | 11.40 | 1.8240 | 13.2240 | 2011-12-09 12:31:00 | Credit Card | 15804.0 | United Kingdom | 12.0 |
| 805528 | 581585 | 22466 | FAIRY TALE COTTAGE NIGHT LIGHT | 12.0 | 1.95 | 23.40 | 3.7440 | 27.1440 | 2011-12-09 12:31:00 | Credit Card | 15804.0 | United Kingdom | 12.0 |
39516 rows × 13 columns